Basic Descriptives: Document Level

# Per-year summary of `data_text`: mean of `numwords_r` (NAs ignored) and the
# number of rows in each Year.
# The former `paragraphs_dropped` mutate was removed: it was never summarised,
# so summarize() discarded it. If it should be reported, aggregate it here.
summary <- data_text %>%
  group_by(Year) %>%
  summarize(
    mean_numwords = mean(numwords_r, na.rm = TRUE),
    N = n(),
    .groups = "drop" # explicit: result is ungrouped, one row per Year
  )
# Line chart of the yearly mean word count.
ggplot(summary, aes(x = Year, y = mean_numwords)) +
  geom_line() +
  ggtitle("Number of words over time")

# Line chart of the yearly row count (N) from the summary table.
ggplot(summary, aes(x = Year, y = N)) +
  geom_line() +
  ggtitle("Number of docs over time")

Basic Descriptives: Future-Words

# Vocabulary of future-oriented words tracked across decades.
v <- c(
  "may", "might", "future", "will", "optimism", "pessimism",
  "uncertainty", "certainty", "outlook", "risk", "risky", "optimistic"
)

# Count each word within each decade, most frequent first.
decade_words <- count(
  group_by(tbl_tokens, decade),
  word,
  sort = TRUE
)

# Total token count per decade: the sum of the per-word counts `n`.
decade_totals <- summarize(
  group_by(decade_words, decade),
  total = sum(n)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Attach each decade's total to its word rows. The key is spelled out so the
# join cannot silently start matching on any other column the two tables may
# come to share.
decade_words <- left_join(decade_words, decade_totals, by = "decade")
# For each tracked future word, plot its share of the decade's total word
# count (n / total) over time; one free-scaled panel per word.
ggplot(
  data = decade_words %>%
    filter(word %in% v) %>%
    mutate(percent = n / total),
  mapping = aes(decade, percent)
) +
  geom_line() +
  facet_wrap(~word, scales = "free") +
  ggtitle("Future words in text")

# tf-idf with `word` as the term, `decade` as the document and `n` as the count.
decade_words_idf <- bind_tf_idf(decade_words, word, decade, n)

# Bar chart of the 15 highest tf-idf words in each decade, one panel per decade.
decade_words_idf %>%
  arrange(desc(tf_idf)) %>%
  # Fix factor levels in reverse tf-idf order so that, after coord_flip(),
  # higher-scoring words are drawn nearer the top of the axis.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(decade) %>%
  # Rank explicitly by tf_idf rather than relying on it being the last column.
  top_n(15, wt = tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = as.factor(decade))) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~decade, ncol = 4, scales = "free") +
  coord_flip() +
  ggtitle("TF-IDF By Decade")

Dictionary words

## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
##              used   (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells    2519242  134.6    4084261   218.2         NA    4084261   218.2
## Vcells 1010610231 7710.4 3497995719 26687.6     102400 4353357712 33213.5
## `summarise()` ungrouping output (override with `.groups` argument)

Dictionary Words, by Subgroup

## `summarise()` regrouping output by 'Year' (override with `.groups` argument)

## `summarise()` regrouping output by 'Year' (override with `.groups` argument)

## `summarise()` regrouping output by 'Year' (override with `.groups` argument)

Words changing over time

## Joining, by = "word"

## Joining, by = "word"

## Joining, by = "word"

## Joining, by = "word"

## Joining, by = "word"

## Joining, by = "word"

## Joining, by = "word"

Readability

## Package version: 2.0.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View

#Number words

## `summarise()` ungrouping output (override with `.groups` argument)

STM Topics

Main Topics: From 5 Topic Solution

## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "topic"

Main Topics: From 5 Topic Solution, by Group

## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
## Joining, by = "topic"

## `summarise()` regrouping output by 'Year' (override with `.groups` argument)
## Joining, by = "topic"

#Lexicon analysis by topics, MEAN SPLIT SCORES

## Joining, by = "filename"
##              used    (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells    2862908   152.9    5300880   283.1         NA    5300880   283.1
## Vcells 1318324636 10058.1 2798396576 21350.1     102400 4353357712 33213.5
## `summarise()` regrouping output by 'Year' (override with `.groups` argument)

STM: Number words (MEAN SPLIT SCORES)

# Merge the topic table into the per-document number statistics. The key is
# spelled out so the join cannot silently start matching on any other column
# the two tables may come to share.
numbers_topic <- numbers %>%
  left_join(topic_df, by = "filename")
# Wide to long: stack the topic indicator columns t0.mean.d .. t4.mean.d into
# `topic` (column name) / `value` pairs, then keep only rows whose indicator
# equals 1.
# pivot_longer() replaces the superseded gather(); the rows come out in a
# different order, which the grouped summaries built on this table ignore.
# NOTE(review): requires tidyr >= 1.0.0 - confirm the installed version.
numbers_topics_long <- numbers_topic %>%
  pivot_longer(
    cols = t0.mean.d:t4.mean.d,
    names_to = "topic",
    values_to = "value"
  ) %>%
  filter(value == 1)

# Mean of `percent_numbers` (NAs ignored) for every Year/topic combination.
number_topics_means <- numbers_topics_long %>%
  group_by(Year, topic) %>%
  summarize(
    mean_per_numbers = mean(percent_numbers, na.rm = TRUE),
    .groups = "drop_last" # explicit: result stays grouped by Year, as before
  )
# Scatter of the yearly mean `percent_numbers`, one panel per topic, with the
# axes limited to Year 1930-2005 and values 0-0.075.
ggplot(
  number_topics_means,
  aes(Year, mean_per_numbers, group = topic)
) +
  geom_point() +
  xlim(1930, 2005) +
  facet_wrap(~topic) +
  ylim(0, 0.075) +
  ggtitle("Number words over time")